{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:08.760201Z",
     "start_time": "2018-07-12T11:57:08.755207Z"
    }
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "from gensim import corpora\n",
    "from gensim import models\n",
    "from gensim import similarities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:09.991492Z",
     "start_time": "2018-07-12T11:57:09.140981Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\wanzheng\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.840 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['你', '今年', '多大', '了']\n"
     ]
    }
   ],
   "source": [
    "l1 = [\"你的名字是什么\", \"你今年几岁了\", \"你有多高你胸多大\", \"你胸多大\"]  # 标准问题\n",
    "a = \"你今年多大了\"  # 我们的自然语言\n",
    "\n",
    "s = jieba.cut(a)\n",
    "print(list(s))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建语料库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:13.711386Z",
     "start_time": "2018-07-12T11:57:13.706387Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['你', '的', '名字', '是', '什么'], ['你', '今年', '几岁', '了'], ['你', '有', '多', '高', '你', '胸多大'], ['你', '胸多大']]\n"
     ]
    }
   ],
   "source": [
    "# 制作语料库\n",
    "all_doc_list = [list(jieba.cut(doc)) for doc in l1]\n",
    "print(all_doc_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:14.261067Z",
     "start_time": "2018-07-12T11:57:14.253071Z"
    },
    "code_folding": [],
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "token2id {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6, '几岁': 7, '多': 8, '有': 9, '胸多大': 10, '高': 11}\n",
      "dictionary Dictionary(12 unique tokens: ['什么', '你', '名字', '是', '的']...)\n"
     ]
    }
   ],
   "source": [
    "# 待匹配文本\n",
    "doc_test_list = list(jieba.cut(a))\n",
    "\n",
    "# 制作词袋\n",
    "dictionary = corpora.Dictionary(all_doc_list)  \n",
    "# 词袋就是将很多很多的词,进行排列形成一个 词(key) 与一个 标志位(value) 的字典\n",
    "print(\"token2id\", dictionary.token2id)\n",
    "print(\"dictionary\", dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:17.287394Z",
     "start_time": "2018-07-12T11:57:17.279401Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(1, 1), (5, 1), (6, 1), (7, 1)], [(1, 2), (8, 1), (9, 1), (10, 1), (11, 1)], [(1, 1), (10, 1)]] <class 'list'>\n"
     ]
    }
   ],
   "source": [
    "corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]\n",
    "# 语料库:\n",
    "# 这里是将all_doc_list 中的每一个列表中的词语 与 dictionary 中的Key进行匹配\n",
    "# 得到一个匹配后的结果,例如['你', '今年', '几岁', '了']\n",
    "# 就可以得到 [[(1, 1), (5, 1), (6, 1), (7, 1)],[123,123,123,123,]]\n",
    "# 1代表的的是 你 1代表出现一次, 5代表的是 了  1代表出现了一次, 以此类推 6 = 今年 , 7 = 几岁\n",
    "print(corpus, type(corpus))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:20.245151Z",
     "start_time": "2018-07-12T11:57:20.239155Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "doc_test_vec [(1, 1), (5, 1), (6, 1)] <class 'list'>\n"
     ]
    }
   ],
   "source": [
    "#将需要寻找相似度的分词列表 做成 语料库 doc_test_vec\n",
    "doc_test_vec = dictionary.doc2bow(doc_test_list)\n",
    "print(\"doc_test_vec\", doc_test_vec, type(doc_test_vec))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建Lsi模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:59:02.285701Z",
     "start_time": "2018-07-12T11:59:02.270721Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "lsi LsiModel(num_terms=12, num_topics=200, decay=1.0, chunksize=20000) <class 'gensim.models.lsimodel.LsiModel'>\n",
      "lsi[corpus] <gensim.interfaces.TransformedCorpus object at 0x0000019F1DB2AAC8>\n"
     ]
    }
   ],
   "source": [
    "#将corpus语料库(初识语料库) 使用Lsi模型进行训练\n",
    "lsi = models.LsiModel(corpus)\n",
    "\n",
    "# 这里的只是需要学习Lsi模型来了解的,这里不做阐述\n",
    "print(\"lsi\", lsi, type(lsi))\n",
    "\n",
    "# 语料库corpus的训练结果\n",
    "print(\"lsi[corpus]\", lsi[corpus] )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:59:02.638500Z",
     "start_time": "2018-07-12T11:59:02.633501Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "lsi[doc_test_vec]---- [(0, 0.9910312948854691), (1, 0.06777365757875868), (2, 1.143786647872063), (3, 0.015934342901802574)]\n"
     ]
    }
   ],
   "source": [
    "# 获得语料库doc_test_vec 在 语料库corpus的训练结果 中的 向量表示\n",
    "print(\"lsi[doc_test_vec]----\", lsi[doc_test_vec])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 创建稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:59:03.967731Z",
     "start_time": "2018-07-12T11:59:03.960735Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "index: <gensim.similarities.docsim.SparseMatrixSimilarity object at 0x0000019F1DB2AB38> \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 文本相似度\n",
    "# 稀疏矩阵相似度 将 主 语料库corpus的训练结果 作为初始值\n",
    "index = similarities.SparseMatrixSimilarity(\n",
    "    lsi[corpus], num_features=len(dictionary.keys()))\n",
    "print(\"index:\", index,'\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 查看待匹配文本在稀疏矩阵的表示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:59:04.779264Z",
     "start_time": "2018-07-12T11:59:04.771269Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sim [0.29518965 0.9900962  0.46673587 0.46673578] <class 'numpy.ndarray'>\n"
     ]
    }
   ],
   "source": [
    "# 将 语料库doc_test_vec 在 语料库corpus的训练结果 中的 向量表示 与 语料库corpus的 向量表示 做矩阵相似度计算\n",
    "sim = index[lsi[doc_test_vec]]\n",
    "print(\"sim\", sim, type(sim))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 查看相似度最匹配的文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:59:06.797112Z",
     "start_time": "2018-07-12T11:59:06.786120Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 0.9900962), (2, 0.46673587), (3, 0.46673578), (0, 0.29518965)]\n",
      "你今年多大了 ------- 你今年几岁了\n",
      "相似度： 0.9900962\n"
     ]
    }
   ],
   "source": [
    "# 对下标和相似度结果进行一个排序,拿出相似度最高的结果\n",
    "# cc = sorted(enumerate(sim), key=lambda item: item[1],reverse=True)\n",
    "cc = sorted(enumerate(sim), key=lambda item: -item[1])\n",
    "print(cc)\n",
    "high_score = cc[0]\n",
    "text = l1[high_score[0]]\n",
    "\n",
    "print(a,'-------', text)\n",
    "print('相似度：', high_score[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建tfidf模型（作比较）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:32.620404Z",
     "start_time": "2018-07-12T11:57:32.614407Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tfidf TfidfModel(num_docs=4, num_nnz=16)\n",
      "tfidf[corpus] <gensim.interfaces.TransformedCorpus object at 0x0000019F1DB2A668>\n"
     ]
    }
   ],
   "source": [
    "#将corpus语料库(初识语料库) 使用Tfidf模型进行训练\n",
    "tfidf = models.TfidfModel(corpus)\n",
    "\n",
    "print(\"tfidf\", tfidf)\n",
    "\n",
    "# 语料库corpus的训练结果\n",
    "print(\"tfidf[corpus]\", tfidf[corpus] )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:36.591261Z",
     "start_time": "2018-07-12T11:57:36.586261Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tfidf[doc_test_vec]---- [(5, 0.7071067811865475), (6, 0.7071067811865475)]\n"
     ]
    }
   ],
   "source": [
    "# 获得语料库doc_test_vec 在 语料库corpus的训练结果 中的 向量表示\n",
    "print(\"tfidf[doc_test_vec]----\", tfidf[doc_test_vec])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:44.245951Z",
     "start_time": "2018-07-12T11:57:44.239952Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "index: <gensim.similarities.docsim.SparseMatrixSimilarity object at 0x0000019F1DB2A6D8> \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 稀疏矩阵相似度 将 主 语料库corpus的训练结果 作为初始值\n",
    "index = similarities.SparseMatrixSimilarity(\n",
    "    tfidf[corpus], num_features=len(dictionary.keys()))\n",
    "print(\"index:\", index,'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:57:50.204632Z",
     "start_time": "2018-07-12T11:57:50.199634Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sim [0.         0.81649655 0.         0.        ] <class 'numpy.ndarray'>\n"
     ]
    }
   ],
   "source": [
    "# 将 语料库doc_test_vec 在 语料库corpus的训练结果 中的 向量表示 与 语料库corpus的 向量表示 做矩阵相似度计算\n",
    "sim = index[tfidf[doc_test_vec]]\n",
    "print(\"sim\", sim, type(sim))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 最相似"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-12T11:58:30.252262Z",
     "start_time": "2018-07-12T11:58:30.241268Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 0.81649655), (0, 0.0), (2, 0.0), (3, 0.0)]\n",
      "你今年多大了 ------- 你今年几岁了\n",
      "相似度： 0.81649655\n"
     ]
    }
   ],
   "source": [
    "cc = sorted(enumerate(sim), key=lambda item: -item[1])\n",
    "print(cc)\n",
    "high_score = cc[0]\n",
    "text = l1[high_score[0]]\n",
    "\n",
    "print(a,'-------', text)\n",
    "print('相似度：', high_score[1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "406px",
    "left": "928px",
    "top": "134.36px",
    "width": "243px"
   },
   "toc_section_display": false,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
